{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import codecs\n",
    "import csv\n",
    "import time\n",
    "import os\n",
    "import re\n",
    "import gzip\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pack_and_remove(file_name):\n",
    "    file_name_new = file_name+'.gz'\n",
    "    with open(file_name, 'rb') as f_in, gzip.open(file_name_new, 'wb') as f_out:\n",
    "        f_out.writelines(f_in)\n",
    "    os.remove(file_name)\n",
    "    return file_name_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def unpack(file_name):\n",
    "    file_name_new = file_name.replace(\".gz\",\"\")\n",
    "    with gzip.open(file_name, 'rb') as f_in, open(file_name_new, 'wb') as f_out:\n",
    "        f_out.writelines(f_in)\n",
    "    return file_name_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATH_TO_DATA = 'Data/'\n",
    "PATH_TO_DATA_UK = PATH_TO_DATA+\"ukwiki/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data/ukwiki/ukwiki-20180620-pages-meta-current.xml_art.csv.gz\n"
     ]
    }
   ],
   "source": [
    "# Decompress the gzipped article CSV to disk, then load it into a DataFrame.\n",
    "# NOTE(review): the unpacked file (fn_new) is left on disk after loading -\n",
    "# confirm whether it should be removed.\n",
    "fn = 'Data/ukwiki/ukwiki-20180620-pages-meta-current.xml_art.csv.gz'\n",
    "print(fn)\n",
    "fn_new = unpack(fn)\n",
    "df_articles = pd.read_csv(\n",
    "    fn_new,\n",
    "    encoding='UTF-8',\n",
    "    quotechar='\"',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the article table into 4 consecutive parts along the first axis.\n",
    "df_articles_array = np.array_split(df_articles, indices_or_sections=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data/ukwiki/ukwiki-20180620-pages-meta-current01-p1p5503930.xml_art.csv\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current02-p5503931p11007860.xml_art.csv\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current03-p11007861p16511790.xml_art.csv\n",
      "Data/ukwiki/ukwiki-20180620-pages-meta-current04-p16511791p22015720.xml_art.csv\n"
     ]
    }
   ],
   "source": [
    "# Write each article part to its own CSV, gzip it, and delete the\n",
    "# uncompressed copy (pack_and_remove does the last two steps).\n",
    "# NOTE(review): the p{from}p{to} range in the file name advances by\n",
    "# count + 2 per part, so it is wider than the number of ids counted -\n",
    "# confirm this is intended.\n",
    "from_i = 1\n",
    "for counter, df in enumerate(df_articles_array, start=1):\n",
    "    n_ids = df['id'].count()  # non-null values in the 'id' column\n",
    "    to_i = from_i + n_ids + 1\n",
    "    fn = \"Data/ukwiki/ukwiki-20180620-pages-meta-current{:02}-p{}p{}.xml_art.csv\".format(\n",
    "        counter, from_i, to_i\n",
    "    )\n",
    "    print(fn)\n",
    "    df.to_csv(fn, index=False, encoding='UTF-8', quotechar='\"')\n",
    "    pack_and_remove(fn)\n",
    "    from_i = to_i + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decompress the gzipped redirect CSV to disk, then load it into a DataFrame.\n",
    "# NOTE(review): the unpacked file (fn_new) is left on disk after loading -\n",
    "# confirm whether it should be removed.\n",
    "fn = 'Data/ukwiki/ukwiki-20180620-pages-meta-current.xml_red.csv.gz'\n",
    "print(fn)\n",
    "fn_new = unpack(fn)\n",
    "df_redirects = pd.read_csv(\n",
    "    fn_new,\n",
    "    encoding='UTF-8',\n",
    "    quotechar='\"',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the redirect table into 4 consecutive parts along the first axis.\n",
    "df_redirects_array = np.array_split(df_redirects, indices_or_sections=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write each redirect part to its own CSV, gzip it, and delete the\n",
    "# uncompressed copy (pack_and_remove does the last two steps).\n",
    "# NOTE(review): the p{from}p{to} range in the file name advances by\n",
    "# count + 2 per part, so it is wider than the number of ids counted -\n",
    "# confirm this is intended.\n",
    "from_i = 1\n",
    "for counter, df in enumerate(df_redirects_array, start=1):\n",
    "    n_ids = df['id'].count()  # non-null values in the 'id' column\n",
    "    to_i = from_i + n_ids + 1\n",
    "    fn = \"Data/ukwiki/ukwiki-20180620-pages-meta-current{:02}-p{}p{}.xml_red.csv\".format(\n",
    "        counter, from_i, to_i\n",
    "    )\n",
    "    print(fn)\n",
    "    df.to_csv(fn, index=False, encoding='UTF-8', quotechar='\"')\n",
    "    pack_and_remove(fn)\n",
    "    from_i = to_i + 1"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
